Question 4

# Load the 2019 World Happiness data and give the columns short names.
w <- read.csv("./worldhappiness2019.csv")
colnames(w) <- c(
  "Rank", "Country", "Score",
  "GDP", "Family", "Life", "Freedom", "Generosity", "Trust"
)
# Key every row by its country so later lookups can use country names.
row.names(w) <- w[["Country"]]

# Drop the first three columns (Rank, Country, Score) to keep only the
# component scores, then show their pairwise correlation matrix.
wscores <- w[, -seq_len(3)]
correlations <- cor(wscores)
print(correlations)
##                    GDP      Family        Life   Freedom  Generosity     Trust
## GDP         1.00000000  0.75490573  0.83546212 0.3790791 -0.07966231 0.2989198
## Family      0.75490573  1.00000000  0.71900946 0.4473332 -0.04812645 0.1818995
## Life        0.83546212  0.71900946  1.00000000 0.3903948 -0.02951086 0.2952828
## Freedom     0.37907907  0.44733316  0.39039478 1.0000000  0.26974181 0.4388433
## Generosity -0.07966231 -0.04812645 -0.02951086 0.2697418  1.00000000 0.3265375
## Trust       0.29891985  0.18189946  0.29528281 0.4388433  0.32653754 1.0000000
# Overwrite the diagonal and lower triangle with -2, a value below any valid
# correlation, so self-correlations are ignored and each pair is seen once.
correlations[lower.tri(correlations, diag = TRUE)] <- -2

# n = 1 takes the maximum of every row, n = 2 of every column; the position of
# the largest of those maxima is then looked up in the names of that same
# dimension. The two names together identify the pair with the largest
# (most positive) correlation; strongly negative pairs are not considered.
lapply(1:2, function(n) {
  dimnames(correlations)[[n]][which.max(apply(correlations, n, max))]
})
## [[1]]
## [1] "GDP"
## 
## [[2]]
## [1] "Life"
# Scatter plot of the two variables picked out above.
subset <- wscores[, c("GDP", "Life")]
plot(subset)

# Row name (country) of the observation with the smallest Life value.
rownames(subset)[which.min(subset[["Life"]])]
## [1] "Swaziland"

The pair of variables with the highest correlation is GDP per capita and healthy life expectancy (r ≈ 0.84).

The outlying country on this graph is Swaziland, which has the lowest Life score in the data set.

# color_branches() comes from dendextend; load it quietly before it is needed.
suppressMessages(library(dendextend))

# Euclidean distances between rows of wscores, computed once and reused for
# both the dist object and its full-matrix form.
d <- dist(wscores, method = "euclidean")
dm <- as.matrix(d)

# Average-linkage hierarchical clustering, converted to a dendrogram whose
# branches are coloured according to a 4-cluster cut.
h <- hclust(d, method = "average")
d1 <- as.dendrogram(h)
d2 <- color_branches(d1, k = 4, col = c(2, 3, 5, 4))

# Shrink the text for this plot only, then put the previous graphics
# settings back so later base-graphics plots are not affected.
old_par <- par(cex = 0.5)
plot(d2)
par(old_par)

There is one extreme outlier country, the Central African Republic, while Morocco, Georgia, Albania and Iran are also outliers to a lesser extent.

# Load the helper script; sumPartition() used below is presumably defined
# there -- confirm against h1code.R.
source("./h1code.R")

# Cut the hierarchical clustering into three groups and count the members
# of each group.
clust3 <- cutree(tree = h, k = 3)
table(clust3)
## clust3
##   1   2   3 
## 111  44   1
# Cluster labels assigned to the two countries of interest.
clust3[c("Ireland", "India")]
## Ireland   India 
##       1       2
# Summarise the three-cluster partition and keep the returned value for the
# centroid lookup further down.
partition <- sumPartition(wscores, clust3)
## Final Partition
## 
## Number of clusters  3
## 
##           N.obs Within.clus.SS Ave.dist..Centroid Max.dist.centroid
## Cluster 1   111      16.679440          0.3618240         0.7628128
## Cluster 2    44       6.145732          0.3505806         0.6628750
## Cluster 3     1       0.000000          0.0000000         0.0000000
## 
## 
## Cluster centroids
## 
##            Cluster 1 Cluster 2 Cluster 3 Grand centrd
## GDP        1.11018   0.4078864 0.026     0.9051474   
## Family     1.34836   0.88425   0         1.208814    
## Life       0.8471712 0.43175   0.105     0.7252436   
## Freedom    0.4175405 0.3333864 0.225     0.3925705   
## Generosity 0.1738829 0.2113636 0.235     0.1848462   
## Trust      0.1148649 0.1015682 0.035     0.1106026   
## 
## 
## Distances between Cluster centroids
## 
##           Cluster 1 Cluster 2 Cluster 3
## Cluster 1 0.0000000 0.9433204  1.895132
## Cluster 2 0.9433204 0.0000000  1.025299
## Cluster 3 1.8951322 1.0252992  0.000000
# Position of the largest entry in the Family row of the centroid table.
# NOTE(review): the printed centroid table also has a "Grand centrd" column,
# so that column presumably takes part in this comparison too -- confirm.
which.max(partition[["centroids"]]["Family", ])
## Cluster 1 
##         1

The cluster sizes are 111, 44 and 1 (let’s call these clusters 1, 2 and 3 respectively). Ireland belongs to cluster 1. India belongs to cluster 2. Cluster 1 has the highest Family score. Cluster 1 also has the highest centroid value on every other variable except Generosity, where cluster 3 is highest.

# Copy the scores and attach the hierarchical cluster labels as a factor so
# the plot can colour the lines by cluster.
wscoresPartitioned1 <- wscores
wscoresPartitioned1[["Cluster"]] <- as.factor(clust3)

# Parallel-coordinate plot of the first six columns (the scores), one line
# per country, with the "uniminmax" scaling option.
ggparcoord(
  data = wscoresPartitioned1,
  columns = seq_len(6),
  groupColumn = "Cluster",
  scale = "uniminmax"
) +
  labs(x = "Scoring Category", y = "Score")

The lowest-scoring cluster is unusual because it has rather high Freedom and Generosity values despite otherwise being quite different in value from the other clusters on the plot.

# Fix the RNG seed so the random k-means starts are reproducible.
set.seed(123)
# Three-cluster k-means on the scores, using 10 random starts.
km <- kmeans(wscores, centers = 3, nstart = 10)
# sumPartition() prints its own report. Wrapping the call in invisible()
# stops the returned list from being auto-printed as well, which previously
# repeated the same tables a second time.
invisible(sumPartition(wscores, km$cluster))
## Final Partition
## 
## Number of clusters  3
## 
##           N.obs Within.clus.SS Ave.dist..Centroid Max.dist.centroid
## Cluster 1    48       3.261740          0.2499082         0.4228718
## Cluster 2    64       6.545869          0.3021908         0.6165757
## Cluster 3    44       6.909874          0.3640020         0.9934354
## 
## 
## Cluster centroids
## 
##            Cluster 1  Cluster 2  Cluster 3 Grand centrd
## GDP        1.3341875 0.93682813 0.39102273    0.9051474
## Family     1.4569375 1.26125000 0.86186364    1.2088141
## Life       0.9596458 0.76050000 0.41825000    0.7252436
## Freedom    0.4763542 0.38393750 0.31372727    0.3925705
## Generosity 0.2004375 0.15737500 0.20779545    0.1848462
## Trust      0.1739583 0.07090625 0.09922727    0.1106026
## 
## 
## Distances between Cluster centroids
## 
##           Cluster 1 Cluster 2 Cluster 3
## Cluster 1 0.0000000 0.5068156 1.2525448
## Cluster 2 0.5068156 0.0000000 0.7634277
## Cluster 3 1.2525448 0.7634277 0.0000000

The outlier countries here are the Central African Republic, Myanmar and Indonesia.

# Copy the scores and attach the k-means cluster labels as a factor so the
# plot can colour the lines by cluster.
wscoresPartitioned2 <- wscores
wscoresPartitioned2$Cluster <- as.factor(km$cluster)

# Parallel-coordinate plot of the first six columns (the scores). The axis
# labels match the ones used for the hierarchical-clustering plot so the two
# figures can be compared directly.
ggparcoord(
  wscoresPartitioned2,
  columns = 1:6,
  groupColumn = "Cluster",
  scale = "uniminmax"
) + xlab("Scoring Category") + ylab("Score")

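Cluster 1 has the highest Family score. Cluster 1 also has the highest centroid value on every other variable except Generosity, where cluster 3 is marginally higher (0.208 vs 0.200).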

# Store the k-means cluster labels on the full data set, then draw one
# boxplot of the Score column per cluster.
w[["Cluster"]] <- as.factor(km$cluster)
ggplot(data = w, mapping = aes(x = Cluster, y = Score)) +
  geom_boxplot()

The scores are clearly quite well clustered and quite distinct, with little overlap between the different clusters. This shows that we have clustered the data effectively.